# download_rockland_raw_bids.py
#
# Authors: Daniel Clark, John Pellman 2015/2016, Carlos G. Candano 2017/2018
'''
This script downloads data from the NKI Rockland Sample Lite releases
stored in the cloud in BIDS format. You can specify sex, age range,
handedness, session, and scan type (anatomical, functional, dwi, fmap)
to limit your download to a subset of the sample. If no options are
specified, all available files are downloaded. Use the '-h' flag for
more information about command line usage.
'''

# Visit list and series. Add new visit names here when new data are uploaded.
SESSIONS = ['NFB3', 'DS2', 'NFB2', 'NFBR2', 'CLG2', 'CLGR', 'CLG4', 'CLG2R',
            'CLG3', 'NFBR2A', 'CLG4R', 'NFB2R', 'DSA', 'CLGA', 'NFBA',
            'CLG2A', 'CLG5', 'CLG', 'NFBAR', 'CLG6', 'CLG5R', 'CLG6R',
            'ALGA', 'ALGAFU1', 'ALGAFU2']

SCANS = ['anat', 'func', 'dwi', 'fmap']

# Mapping of colloquial series names to BIDS names.
SERIES_MAP = {'CHECKERBOARD1400': 'task-CHECKERBOARD_acq-1400',
              'CHECKERBOARD645': 'task-CHECKERBOARD_acq-645',
              'RESTCAP': 'task-rest_acq-CAP',
              'REST1400': 'task-rest_acq-1400',
              'BREATHHOLD1400': 'task-BREATHHOLD_acq-1400',
              'REST645': 'task-rest_acq-645',
              'RESTPCASL': 'task-rest_pcasl',
              'DMNTRACKINGTEST': 'task-DMNTRACKINGTEST',
              'DMNTRACKINGTRAIN': 'task-DMNTRACKINGTRAIN',
              'MASK': 'mask',
              'MSIT': 'task-MSIT',
              'PEER1': 'task-PEER1',
              'PEER2': 'task-PEER2',
              'MORALDILEMMA': 'task-MORALDILEMMA'}


def files(client, bucket, prefix=''):
    """
    Yield the path(s) to the participants.tsv file(s) in the bucket.
    Using 'participants.tsv' as the list delimiter makes matching keys
    come back as common prefixes.
    """
    paginator = client.get_paginator('list_objects')
    for result in paginator.paginate(Bucket=bucket, Prefix=prefix,
                                     Delimiter='participants.tsv'):
        for common_prefix in result.get('CommonPrefixes', []):
            yield common_prefix.get('Prefix')


def generate_Subfolders(s3_client):
    """
    Generate a list of the bucket contents under the RawDataBIDS prefix.
    """
    gen_subfolders = files(s3_client, 'fcp-indi',
                           prefix='data/Projects/RocklandSample/RawDataBIDS/')
    gen_subfolders_list = list(gen_subfolders)
    return gen_subfolders_list
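
# The helper functions above depend on boto3's anonymous (unsigned) access
# and paginated listing. A minimal standalone sketch of that pattern follows;
# it is illustrative only and is never called by this script.
def _example_anonymous_listing():
    import boto3
    from botocore import UNSIGNED
    from botocore.client import Config

    # Anonymous client: no AWS credentials are needed for the fcp-indi bucket.
    client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    paginator = client.get_paginator('list_objects')
    pages = paginator.paginate(
        Bucket='fcp-indi',
        Prefix='data/Projects/RocklandSample/RawDataBIDS/',
        PaginationConfig={'MaxItems': 10})
    # Each page carries its keys under 'Contents'.
    for page in pages:
        for obj in page.get('Contents', []):
            print(obj['Key'])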

# Main collect and download function
def collect_and_download(out_dir, less_than=0, greater_than=0, sex='',
                         handedness='', sessions=SESSIONS, scans=SCANS,
                         series=SERIES_MAP.keys(), derivatives=False,
                         dryrun=False):
    '''
    Function to collect and download images from the Rockland sample
    directory on FCP-INDI's S3 bucket.

    Parameters
    ----------
    out_dir : string
        filepath to a local directory to save files to
    less_than : float
        upper age (years) threshold for participants of interest
    greater_than : float
        lower age (years) threshold for participants of interest
    sex : string
        'M' or 'F' to indicate whether to download male or female data
    handedness : string
        'R' or 'L' to indicate whether to download right-handed or
        left-handed participants
    sessions : list
        the session names (e.g., 'CLG5', 'NFB3')
    scans : list
        the scan types to download; can be 'anat', 'func', 'dwi' or 'fmap'
    series : list
        the series to download (for functional scans)
    derivatives : boolean
        whether or not to download data derivatives for functional scans
    dryrun : boolean
        whether or not to perform a dry run (i.e., no actual downloads,
        just listing the files that would be downloaded)

    Returns
    -------
    boolean
        Returns True if the download was successful, False otherwise.
    '''
    # Import packages
    import os
    import pandas
    import boto3
    # For anonymous access to the bucket.
    from botocore import UNSIGNED
    from botocore.client import Config
    from botocore.handlers import disable_signing

    # Init variables
    s3_bucket_name = 'fcp-indi'
    s3_prefix = 'data/Projects/RocklandSample/RawDataBIDS'

    # Fetch bucket
    s3 = boto3.resource('s3')
    s3.meta.client.meta.events.register('choose-signer.s3.*', disable_signing)
    s3_bucket = s3.Bucket(s3_bucket_name)

    # Remove series that aren't in the series map keys.
    series = [s for s in series if s in SERIES_MAP.keys()]

    # If the output path doesn't exist, create it.
    if not os.path.exists(out_dir) and not dryrun:
        print('Could not find %s, creating now...' % out_dir)
        os.makedirs(out_dir)

    # Download the participants.tsv file from S3.
    s3_client = boto3.client('s3', config=Config(signature_version=UNSIGNED))
    # generate_Subfolders returns the participants.tsv path(s) in the bucket
    # as a list; take the first (and only) match.
    subfolder_paths = generate_Subfolders(s3_client)
    participants_key = subfolder_paths[0]
    # Derive the local file name ('participants.tsv') from the key and save
    # the file into the current working directory.
    participants_fname = participants_key[participants_key.index('partic'):]
    with open(participants_fname, 'wb') as f:
        s3_client.download_fileobj(s3_bucket_name, participants_key, f)

    # Load the participants file as a dataframe.
    participants_df = pandas.read_csv(participants_fname, delimiter='\t',
                                      na_values=['n/a'])

    # Collect the keys of interest: everything inside the RawDataBIDS folder.
    print('Collecting images of interest...')
    s3_keys = s3_bucket.objects.filter(Prefix=s3_prefix)
    s3_keylist = [key.key for key in s3_keys]

    # Remove the participant rows whose age range, handedness and sex do not
    # conform to the criteria.
    if less_than:
        participants_df = participants_df[participants_df['age'] < less_than]
    if greater_than:
        participants_df = participants_df[participants_df['age'] >
                                          greater_than]
    if sex == 'M':
        participants_df = participants_df[participants_df['sex'] == 'MALE']
    elif sex == 'F':
        participants_df = participants_df[participants_df['sex'] == 'FEMALE']
    if handedness == 'R':
        participants_df = participants_df[participants_df['handedness'] ==
                                          'RIGHT']
    elif handedness == 'L':
        participants_df = participants_df[participants_df['handedness'] ==
                                          'LEFT']
    if len(participants_df) == 0:
        print('No participants meet the criteria given. '
              'No download will be initiated.')
        return False

    # Generate a single-column list of participants to filter on.
    participants_filt = ['sub-' + label + '/' for label in
                         participants_df['participant_id'].tolist()]
    # Generate a unique list of sessions to filter on.
    sessions_filt = ['ses-' + session + '/' for session in sessions]
    # Generate a list of series to filter on (e.g. task-CHECKERBOARD...).
    series_filt = [SERIES_MAP[s] for s in series]

    # Fetch top-level JSONs first: paths to the unique JSON files of the
    # whole sample.
    json_keylist = [key for key in s3_keylist for s in series_filt
                    if s in key and 'json' in key and 'sub' not in key]
    # Keep only keys that contain a subject from participants_filt.
    s3_keylist = [key for key in s3_keylist for p in participants_filt
                  if p in key]
    # Keep only keys with a valid session from the unique session list.
    s3_keylist = [key for key in s3_keylist for s in sessions_filt if s in key]
    # Keep only keys with a scan type specified in scans
    # (e.g. 'anat', 'func' or 'dwi').
    s3_keylist = [key for key in s3_keylist for s in scans if s in key]
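    # Worked example of the filters above (the subject ID is illustrative):
    # the key
    #   data/Projects/RocklandSample/RawDataBIDS/sub-A00012345/ses-NFB3/func/
    #       sub-A00012345_ses-NFB3_task-rest_acq-645_bold.nii.gz
    # survives the participant filter if 'sub-A00012345/' is in
    # participants_filt, the session filter because it contains 'ses-NFB3/',
    # the scan filter because it contains 'func', and the series filter below
    # because it contains 'task-rest_acq-645' (i.e. SERIES_MAP['REST645']).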
    # Keep only keys whose series matches the filtered series list
    # (e.g. checkerboard, rest); non-functional scans are kept regardless.
    s3_keylist = [key for key in s3_keylist for s in series_filt
                  if (s in key) or ('func' not in key)]

    # Append the JSON paths to the end of s3_keylist (around 13 sequences).
    s3_keylist.extend(json_keylist)
    # Also append the CHANGES, README and dataset_description.json paths.
    s3_keylist.append('/'.join([s3_prefix, 'CHANGES']))
    s3_keylist.append('/'.join([s3_prefix, 'README']))
    s3_keylist.append('/'.join([s3_prefix, 'dataset_description.json']))

    # Verify that the participants dataframe has only the subjects that
    # appear in s3_keylist; not all participants have all the scan types.
    new_participants_df = pandas.DataFrame(columns=participants_df.columns)
    for row in s3_keylist:
        row_divided = row.split('/')
        # Keep only the path components of the form sub-A000...
        just_the_sub = [x for x in row_divided
                        if x.startswith('sub-A') and len(x) == 13]
        if len(just_the_sub) > 0:
            # Remove the 'sub-' part.
            ursi = just_the_sub[0][4:]
            partic_row = participants_df[
                participants_df.iloc[:, 0].str.contains(ursi)]
            # Build the dataframe; repeated rows are dropped below.
            new_participants_df = pandas.concat([new_participants_df,
                                                 partic_row])
    new_participants_df = new_participants_df.drop_duplicates()
    participants_df = new_participants_df

    # Re-create the participants list after cross-checking against
    # s3_keylist for participants with the correct scan types.
    participants_filt = ['sub-' + label + '/' for label in
                         participants_df['participant_id'].tolist()]

    # And download the items; the total is the number of keys in s3_keylist.
    total_num_files = len(s3_keylist)
    files_downloaded = len(s3_keylist)

    for path_idx, s3_path in enumerate(s3_keylist):
        print(s3_path)
        # Strip the 'data/Projects/RocklandSample/RawDataBIDS' prefix and the
        # leading slash to get the path relative to the output directory.
        rel_path = s3_path.replace(s3_prefix, '')
        rel_path = rel_path.lstrip('/')
        # Build the local folder and file paths.
        download_file = os.path.join(out_dir, rel_path)
        download_dir = os.path.dirname(download_file)
        # Create the folder at the path specified.
        if not os.path.exists(download_dir) and not dryrun:
            os.makedirs(download_dir)
        try:
            if not os.path.exists(download_file):
                # A dry run will not download the files.
                if dryrun:
                    print('Would download to: %s' % download_file)
                else:
                    print('Downloading to: %s' % download_file)
                    # Download the file into the just-created folder.
                    with open(download_file, 'wb') as f:
                        s3_client.download_fileobj(s3_bucket_name, s3_path, f)
                    print('%.3f%% complete' %
                          (100 * (float(path_idx + 1) / total_num_files)))
            else:
                print('File %s already exists, skipping...' % download_file)
                files_downloaded -= 1
        except Exception as exc:
            print('There was a problem downloading %s.\n'
                  'Check input arguments and try again.' % s3_path)
            print(exc)

    if dryrun:
        print('%d files would be downloaded for %d participant(s).' %
              (files_downloaded, len(participants_df)))
    else:
        print('%d files downloaded for %d participant(s).' %
              (files_downloaded, len(participants_df)))

    if not dryrun:
        print('Saving out revised participants.tsv and session tsv files.')
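        # Merge semantics, illustrated with hypothetical IDs: if an earlier
        # run wrote a participants.tsv containing sub-A00012345 and
        # sub-A00023456, and this run selected sub-A00023456 and
        # sub-A00034567, the file saved below ends up with all three rows,
        # with exact duplicates dropped.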
        # Save out the revised participants.tsv to the output directory; if a
        # participants.tsv already exists, merge it into the new one.
        if os.path.isfile(os.path.join(out_dir, 'participants.tsv')):
            old_participants_df = pandas.read_csv(
                os.path.join(out_dir, 'participants.tsv'),
                delimiter='\t', na_values=['n/a', 'N/A'])
            participants_df = pandas.concat([participants_df,
                                             old_participants_df],
                                            ignore_index=True)
            participants_df.drop_duplicates(inplace=True)
            os.remove(os.path.join(out_dir, 'participants.tsv'))
        participants_df.to_csv(os.path.join(out_dir, 'participants.tsv'),
                               sep='\t', na_rep='n/a', index=False)

        # Separate list for sessions TSVs.
        session_keylist = [key.key for key in s3_keys
                           if 'sessions.tsv' in key.key]
        session_keylist = [key for key in session_keylist
                           for p in participants_filt if p in key]

        # Save out revised session TSVs to the output directory; if one
        # already exists, open it and merge it with the new one.
        for session_key in session_keylist:
            participant = session_key.split('/')[-2]
            sessions_obj = s3_client.get_object(Bucket=s3_bucket_name,
                                                Key=session_key)
            sessions_df = pandas.read_csv(sessions_obj['Body'],
                                          delimiter='\t', na_values=['n/a'])
            # Drop all sessions not specified. The session_id column holds
            # values like 'ses-NFB3' with no trailing slash, so strip the
            # slashes used by the key filters above.
            sessions_df = sessions_df[sessions_df['session_id'].isin(
                [s.rstrip('/') for s in sessions_filt])]
            sessions_tsv = os.path.join(out_dir, participant,
                                        participant + '_sessions.tsv')
            if os.path.isfile(sessions_tsv):
                old_sessions_df = pandas.read_csv(sessions_tsv,
                                                  delimiter='\t',
                                                  na_values=['n/a', 'N/A'])
                sessions_df = pandas.concat([sessions_df, old_sessions_df],
                                            ignore_index=True)
                sessions_df.drop_duplicates(inplace=True)
                os.remove(sessions_tsv)
            sessions_df.to_csv(sessions_tsv, sep='\t', na_rep='n/a',
                               index=False)

    print('Done!')
    return True


# Make module executable
if __name__ == '__main__':
    # Import packages
    import argparse
    import sys
    import os

    # Init argparser
    parser = argparse.ArgumentParser(description=__doc__)

    # Required arguments
    parser.add_argument('-o', '--out_dir', required=True, type=str,
                        help='Path to local folder to download files to')

    # Optional arguments
    parser.add_argument('-lt', '--less_than', required=False, type=float,
                        help='Upper age threshold (in years) of participants '
                             'to download (e.g. for subjects 30 or younger, '
                             '\'-lt 31\')')
    parser.add_argument('-gt', '--greater_than', required=False, type=float,
                        help='Lower age threshold (in years) of participants '
                             'to download (e.g. for subjects 31 or older, '
                             '\'-gt 30\')')
    parser.add_argument('-x', '--sex', required=False, type=str,
                        help='Participant sex of interest to download only '
                             '(e.g. \'M\' or \'F\')')
    parser.add_argument('-m', '--handedness', required=False, type=str,
                        help='Participant handedness to download only '
                             '(e.g. \'R\' or \'L\')')
    parser.add_argument('-v', '--sessions', required=False, nargs='*',
                        type=str,
                        help='A space-separated list of session (visit) '
                             'codes to download (e.g. \'NFB3\', \'CLG2\')')
    parser.add_argument('-t', '--scans', required=False, nargs='*', type=str,
                        help='A space-separated list of scan types to '
                             'download (e.g. \'anat\', \'dwi\')')
    parser.add_argument('-e', '--series', required=False, nargs='*', type=str,
                        help='A space-separated list of series codes to '
                             'download (e.g. \'DMNTRACKINGTRAIN\', '
                             '\'DMNTRACKINGTEST\')')
    parser.add_argument('-d', '--derivatives', required=False,
                        action='store_true',
                        help='Download derivatives (despiked physio, masks) '
                             'in addition to raw data?')
    parser.add_argument('-n', '--dryrun', required=False, action='store_true',
                        help='Perform a dry run to see how many files would '
                             'be downloaded.')

    # Parse and gather arguments
    args = parser.parse_args()
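    # Example invocations (paths and filter values are illustrative):
    #   python download_rockland_raw_bids.py -o /tmp/rockland -n
    #   python download_rockland_raw_bids.py -o /tmp/rockland -lt 31 -gt 17 \
    #       -x F -t anat
    #   python download_rockland_raw_bids.py -o /tmp/rockland -v NFB3 \
    #       -e REST645 CHECKERBOARD645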
    # Init variables
    out_dir = os.path.abspath(args.out_dir)
    kwargs = {}

    if args.less_than:
        kwargs['less_than'] = args.less_than
        print('Using upper age threshold of %d...' % kwargs['less_than'])
    else:
        print('No upper age threshold specified')

    if args.greater_than:
        kwargs['greater_than'] = args.greater_than
        print('Using lower age threshold of %d...' % kwargs['greater_than'])
    else:
        print('No lower age threshold specified')

    if args.sex:
        kwargs['sex'] = args.sex.upper()
        if kwargs['sex'] == 'M':
            print('Downloading only male participants...')
        elif kwargs['sex'] == 'F':
            print('Downloading only female participants...')
        else:
            print('Input for sex \'%s\' was not \'M\' or \'F\'.' %
                  kwargs['sex'])
            print('Please check script syntax and try again.')
            sys.exit(1)
    else:
        print('No sex specified, using all sexes...')

    if args.handedness:
        kwargs['handedness'] = args.handedness.upper()
        if kwargs['handedness'] == 'R':
            print('Downloading only right-handed participants...')
        elif kwargs['handedness'] == 'L':
            print('Downloading only left-handed participants...')
        else:
            print('Input for handedness \'%s\' was not \'L\' or \'R\'.' %
                  kwargs['handedness'])
            print('Please check script syntax and try again.')
            sys.exit(1)

    if args.sessions:
        kwargs['sessions'] = args.sessions
        for session in kwargs['sessions']:
            if session not in SESSIONS:
                print('Session \'%s\' is not a valid session name.' % session)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Sessions to download: ' + ' '.join(kwargs['sessions']))

    if args.scans:
        kwargs['scans'] = args.scans
        for scan in kwargs['scans']:
            if scan not in SCANS:
                print('Scan \'%s\' is not a valid scan name.' % scan)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Scans to download: ' + ' '.join(kwargs['scans']))

    if args.series:
        kwargs['series'] = args.series
        for series in kwargs['series']:
            if series not in SERIES_MAP.keys():
                print('Series \'%s\' is not a valid series name.' % series)
                print('Please check script syntax and try again.')
                sys.exit(1)
        print('Series to download: ' + ' '.join(kwargs['series']))

    if args.derivatives:
        kwargs['derivatives'] = args.derivatives
        print('Data derivatives will be downloaded.')

    if args.dryrun:
        kwargs['dryrun'] = args.dryrun
        print('Running download as a dry run.')

    # Call the collect and download routine
    collect_and_download(out_dir, **kwargs)
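
# Programmatic use (a sketch; the output path is hypothetical): the function
# can also be imported and called from another script, e.g.
#
#     from download_rockland_raw_bids import collect_and_download
#     collect_and_download('/tmp/rockland', less_than=31, greater_than=17,
#                          scans=['func'], series=['REST645'], dryrun=True)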